In [1]:
from IPython.display import Audio
sound_file = 'beep-05.wav'

ROP Data Analysis 0.2


In [2]:
# Import Modules and Identify Low Signals
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import pandas as pd
from datetime import datetime, timedelta
from pandas import Series, DataFrame, concat
from scipy import stats, fftpack
from IPython.display import display
import sys
import os, errno


#Treat any string containing 'C' as NaN. 'C' is the Masimo's indicator for a low-signal reading.
def readD(val):
    if 'C' in val:
        return np.nan
    return val

%matplotlib inline
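
A quick sanity check of readD (toy inputs, not study data): any string containing 'C' maps to NaN, everything else passes through unchanged.

readD('C---')  # -> nan (low-signal flag)
readD('0.53')  # -> '0.53' (left for read_csv to parse)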

In [3]:
Baby_lst = ['ROP003', 'ROP005', 'ROP006', 'ROP007', 'ROP008', 'ROP009', 'ROP010',
          'ROP011', 'ROP012', 'ROP013', 'ROP014', 'ROP015', 'ROP017', 'ROP018',
          'ROP019', 'ROP020', 'ROP021', 'ROP022', 'ROP023', 'ROP025']

#update this as you go

In [4]:
import ROPtimes as rt

Baby = 'ROP003' #change depending on which baby you're analyzing!
BabyDict = getattr(rt, Baby)

In [5]:
def mkdir_p(path):
    """"
    This function creates a folder at of the given path, unless the folder already exsists. 
    Parameters
    ----------
    path : string,
        full file path or relative file path.
    Returns
    -------
    None
    Notes
    -----
    This does not check that the file path makes sense or is formatted correctly for OS.
    Examples
    --------
    mkdir_p(User/me/my/path)
    References
    ----------
    .. [1] http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else: raise

In [6]:
mkdir_p('Data/'+Baby)
mkdir_p('Data/'+Baby+'/plots') #makes a plots folder if one does not exist
mkdir_p('Data/'+Baby+'/csv') #makes a csv folder if one does not exist

Load Data


In [7]:
dfNIRSpre0 = pd.read_csv('/Users/John/Dropbox/LLU/Projects/Retinopathy of Prematurity/NIRS/Clean/' + Baby + 'NIRS.csv',
                         parse_dates={'timestamp': ['Date',' Time']},
                         index_col='timestamp',
                         usecols=['Date', ' Time', ' Ch2 %StO2'],
                         na_values=['0'])

dfPOpre0 = pd.read_csv('/Users/John/Dropbox/LLU/Projects/Retinopathy of Prematurity/Pulse Ox/' + Baby + 'PO.csv',
                       parse_dates={'timestamp': ['Date','Time']},
                       index_col='timestamp',
                       usecols=['Date', 'Time', 'SpO2', 'PR'],
                       na_values=['0'])

dfPIpre0 = pd.read_csv('/Users/John/Dropbox/LLU/Projects/Retinopathy of Prematurity/Pulse Ox/' + Baby + 'PO.csv',
                       parse_dates={'timestamp': ['Date','Time']},
                           #parse_dates tells read_csv to combine the Date and Time columns
                       index_col='timestamp',
                       usecols=['Date', 'Time', 'PI', 'Exceptions'],
                       na_values=['0'], #na_values turns 0 into NaN
                       converters={'Exceptions': readD}
                           #converters: readD makes any Exceptions string containing 'C' a NaN (low-signal PI)
                       )

dfNIRSpre = dfNIRSpre0.rename(columns={' Ch2 %StO2': 'StO2'}) #rename NIRS column

#Clean the PI dataframe to get rid of rows that have NaN
dfPIpre = dfPIpre0.dropna(how='any')

dfPOpre = dfPIpre.combine_first(dfPOpre0)
#combine PI dataframe with the rest of the Pulse Ox dataframe after cleaning
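
combine_first keeps the calling frame's non-null values and fills the gaps (and missing columns) from the other frame, aligning on the index. A toy illustration with made-up values:

left = pd.DataFrame({'PI': [np.nan, 1.2]}, index=[0, 1])
right = pd.DataFrame({'PI': [0.9, 1.0], 'PR': [150, 152]}, index=[0, 1])
left.combine_first(right)
#     PI     PR
#0   0.9  150.0
#1   1.2  152.0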

Clean Data

- Remove all 0's and set as NaN
- Remove all PI values with low signal integrity

In [8]:
# Phone almost always equals Philips monitor

# NIRS date/time is 2 mins and 10 seconds slower than phone. Have to correct for it.
# Pulse ox date/time is 1 mins and 32 seconds faster than phone. Have to correct for it.

#This code makes the combined dataframe land on even seconds.
#The parity correction is 1 second: + for NIRS and - for PO.

TCcorrect = timedelta(seconds=1)

ncorr = dfNIRSpre.index+TCcorrect
pcorr = dfPOpre.index-TCcorrect


if dfNIRSpre.index[0].second % 2 == 0:
    if dfPOpre.index[0].second % 2 == 0:
        print '\nBoth NIRS and PO indices are even.'
    else:
        print '\nNIRS even, PO odd. PO index corrected.'
        dfPOpre = dfPOpre.set_index(pcorr)
else:
    if dfPOpre.index[0].second % 2 == 0:
        print '\nNIRS odd, PO even. NIRS index corrected.'
        dfNIRSpre = dfNIRSpre.set_index(ncorr)
    else:
        print '\nBoth NIRS and PO indices are odd. Both corrected'
        dfNIRSpre = dfNIRSpre.set_index(ncorr)
        dfPOpre = dfPOpre.set_index(pcorr)

TCnirs = timedelta(minutes=2, seconds=10)
TCpo = timedelta(minutes=1, seconds=32)
# NIRS is slower than correct time, need to add TCnirs to catch it up
# PO is faster than correct time, need to subtract TCpo to slow it down

dfNIRS = dfNIRSpre.set_index([dfNIRSpre.index+TCnirs]) #NIRS Time Correction
dfPO = dfPOpre.set_index([dfPOpre.index-TCpo]) #PO Time Correction


Both NIRS and PO indices are odd. Both corrected

In [9]:
#for babies that only had one machine

dffakePO = pd.DataFrame({'SpO2':0, 'PR':0, 'PI':0}, index=dfNIRS.index)
dffakeNIRS = pd.DataFrame({'StO2':0}, index=dfPO.index)

if len(dfNIRS) > 5:
    if len(dfPO) > 5:
        df = dfNIRS.combine_first(dfPO) #Combine two DataFrame objects and default to non-null values in frame
        Masimo = True
        NIRS = True
        print 'Both machines on'
    elif len(dfPO) < 5:
        df = dfNIRS.combine_first(dffakePO)
        print 'Only NIRS on'
        NIRS = True
        Masimo = False
elif len(dfNIRS) < 5:
    df = dffakeNIRS.combine_first(dfPO)
    Masimo = True
    NIRS = False
    print 'Only Masimo on'
else:
    raise NameError('Check your files')


Only NIRS on

In [10]:
# save files

def saveplot(filename):
    plt.savefig('Data/'+Baby+'/plots/'+'BDA_'+filename+'.pdf')
    
def savecsv(filename, df):
    df.to_csv('Data/'+Baby+'/csv/'+'BDA_'+filename+'.csv')

In [11]:
blr = -(df.index[0]-BabyDict['ExamStart']).total_seconds()*0.000277778 #0.000277778 ~= 1/3600, converts seconds to hours
print 'Total (h) Baseline recording = %s' % blr
aer = ((df.index[-1])-BabyDict['ExamEnd']).total_seconds()*0.000277778
print 'Total (h) After Exam recording = %s' % aer


Total (h) Baseline recording = 2.240001792
Total (h) After Exam recording = 25.115020092

In [12]:
#Convert to Relative Time and Time Windows
def reltimecalc(ROPNumber, df):
    ROPreltime = {
        'BaselineStart' : (df.index[0].to_datetime()-ROPNumber['ExamStart']).total_seconds(), #should be negative
        'BaselineEnd': (ROPNumber['EyeDrop1'] - ROPNumber['ExamStart']).total_seconds(), #redundant with EyeDrop1, kept for labeling's sake
        'EyeDrop1' : (ROPNumber['EyeDrop1'] - ROPNumber['ExamStart']).total_seconds(), #neg
        'EyeDrop2' : (ROPNumber['EyeDrop2'] - ROPNumber['ExamStart']).total_seconds(), #neg
        'EyeDrop3' : (ROPNumber['EyeDrop3'] - ROPNumber['ExamStart']).total_seconds(), #neg
        'ExamStart' : (ROPNumber['ExamStart'] - ROPNumber['ExamStart']).total_seconds(), 
        'ExamEnd' :  (ROPNumber['ExamEnd'] - ROPNumber['ExamStart']).total_seconds(), #positive number
        'DataEnd' : (df.index[-1].to_datetime()-ROPNumber['ExamStart']).total_seconds()
    }
    
    TimeWindows = {
        'tw0' : -10800.0, # three hours before 0
        'tw1' : 10800.0, # three hours after 0
        'tw2' : 21600.0, # six hours
        'tw3' : 32400.0, # nine hours
        'tw4' : 43200.0, # 12 hours
        'tw5' : 54000.0, # 15 hours
        'tw6' : 64800.0, # 18 hours
        'tw7' : 75600.0, # 21 hours
        'tw8' : 86400.0 #24 hours
    }
    
    return ROPreltime, TimeWindows

In [13]:
tw = timedelta(hours=3)
twend = timedelta(hours=24)
df = df[(BabyDict['ExamStart']-tw):(BabyDict['ExamStart']+twend)]

In [14]:
Data, tw = reltimecalc(BabyDict, df)

In [15]:
#if after exam recording doesn't reach 24 hours, create fake rows
#necessary to make relative time accurate

dfnan = pd.DataFrame({'Exceptions' : np.NaN}, index = [0])

#48601 rows = 27 h of relative time (3 h before + 24 h after the exam) at one sample per 2 s, plus an endpoint
while len(df) < 48601:
    df = df.append(dfnan)

In [16]:
df = df.set_index(np.linspace(tw['tw0'],tw['tw8'],len(df)))

In [17]:
if Masimo != False:    
    df = df[df.PI.between(0.02, 20)] #keep only PI values between 0.02 and 20
    df = df.drop('Exceptions', 1) # Drop Exceptions now that it is no longer needed

In [18]:
if Masimo != False and NIRS != False:
    dfpreFTOE = pd.DataFrame({'SpO2' : df['SpO2'].values, 'StO2' : df['StO2'].values}, index=df.index)
    dfpreFTOE = dfpreFTOE.dropna(how='any')

    psa = dfpreFTOE['SpO2'] #arterial saturation
    prs = dfpreFTOE['StO2'] #regional tissue saturation

    dfFTOE = pd.DataFrame({'FTOE' : (((psa-prs)/psa)*100)}) #fractional tissue oxygen extraction (%)

    df = df.join(dfFTOE)
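
FTOE here is the fractional tissue oxygen extraction, (SpO2 - StO2)/SpO2 * 100. As a worked example with made-up values: SpO2 = 95 and StO2 = 85 give (95 - 85)/95 * 100 ~= 10.5%.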

Analysis Start

Linear Measures:

Avgs, Std Dev, and Coefficient of Variation

- Avg, std dev, and CoV for every time window
- Avg every 10 sec for 4 mins of ROP exam, std dev, CoV.
- O2 DeSat Counts
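
No cell below actually counts desaturations, so here is a minimal sketch of what one could look like. The 85% cutoff and the event definition (a fall from at or above the threshold to below it) are assumptions for illustration, not the study's definition:

def desat_count(data, thresh=85.0):
    #count falling crossings of thresh in an SpO2 series
    #NaNs are dropped first, which can merge events that straddle a gap
    below = (data.dropna() < thresh).astype(int)
    return int((below.diff() == 1).sum())

#e.g. desat_count(df['SpO2'])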

In [19]:
if Masimo == False:
    df = df[['StO2']]
elif NIRS == False:
    df = df.drop(['StO2'], axis=1)
else:
    df = df

In [20]:
#create easy variables for different time epochs



a = df[tw['tw0']:0] #3 h before exam
b = df[0:240] #during exam (first 240 s = 4 min)
c = df[240:tw['tw4']] #exam end through 12 h after
d = df[240:tw['tw8']] #exam end through 24 h after

if aer <= 12:
    d = df[tw['tw4']:tw['tw8']]
    d[d > 0] = np.NaN #recording shorter than 12 h post-exam: blank out the 12-24 h window
    
    
dflst = [a, b, c, d] #list of dataframes

twind = ['Before','During','12H After','24H After'] #list of time windows

if Masimo == False:
    varlst = ['StO2']
elif NIRS == False:
    varlst = ['PI', 'PR', 'SpO2']
else:    
    varlst = ['PI', 'PR', 'SpO2', 'StO2', 'FTOE'] #list of variables

In [21]:
def linearmeasures(data): 
    
    if type(data) == pd.core.series.Series:
        data = data.values
    
    avg = np.nanmean(data) #mean
    stdev = np.nanstd(data) #std dev
    cv = stats.variation(data, nan_policy='omit') * 100 #%CoV = std dev / mean * 100
    
    return avg, stdev, cv
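
A toy check with made-up numbers: for the series [80, 90, 100], the mean is 90, the population std dev is ~8.165, and the CoV is ~9.07%.

linearmeasures(pd.Series([80.0, 90.0, 100.0]))
#(90.0, 8.1649..., 9.0721...)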

In [22]:
def dflinmeasure(variable, graph=True, table=True):
    
    datalst = []
    
    for i in dflst:
        x = linearmeasures(i[variable])
        datalst.append(x)
        
    dfresult = pd.DataFrame(datalst, columns=[variable + ' mean', variable + ' std dev', variable + ' % CV'], index=twind)
    
    dfresult.index.name = 'Time Windows (h)'
    
    if graph == True:
        
        dfresult[variable + ' mean'].plot(yerr=dfresult[variable +' std dev'],
                                          color='b', ecolor='r') 

        plt.title(Baby + ' ' + variable + ' Mean with Std Dev')
        plt.xlabel('Time Windows')
        plt.ylabel(variable)
        plt.xlim(-1,4)
        plt.xticks(np.arange(0, 4), twind, rotation='vertical')
        
        saveplot(Baby + variable + '_mean_stddev_24hrs')

        plt.show()

    if table == True:
        
        savecsv(Baby + variable + '_mean_stddev_24hrs', dfresult)
        
        display(dfresult)
        
    return dfresult

In [23]:
if Masimo != False and NIRS !=False:
    PIlin = dflinmeasure('PI')
    PRlin = dflinmeasure('PR')
    SpO2lin = dflinmeasure('SpO2')
    StO2lin = dflinmeasure('StO2')
    FTOElin = dflinmeasure('FTOE')
elif Masimo != True and NIRS == True:
    StO2lin = dflinmeasure('StO2')
elif Masimo == True and NIRS != True:
    PIlin = dflinmeasure('PI')
    PRlin = dflinmeasure('PR')
    SpO2lin = dflinmeasure('SpO2')


                  StO2 mean  StO2 std dev  StO2 % CV
Time Windows (h)
Before            87.827307      3.128397   3.561986
During            88.115702      2.796663   3.173853
12H After         84.732780      5.088882   6.005801
24H After         84.212759      4.872903   5.786418

Information Theory

- Mutual Information
- Fuzzy Entropy
- Cross Fuzzy Entropy

In [24]:
from sklearn.metrics import mutual_info_score
#sklearn.metrics.mutual_info_score(labels_true, labels_pred, contingency=None)

def MutualInfo(var1, var2, graph = True, table = True):

    datalst = []
    
    for i in dflst:
        
        z = i.dropna(how='any') #drop before splitting variables to make len equal
        data1 = z[var1]
        data2 = z[var2]
        
        try:
            x = mutual_info_score(data1, data2)
            datalst.append(x)
        except ValueError:
            datalst.append(np.nan)
            
    dfresult = pd.DataFrame(datalst, columns=['Mutual Info Score ' + var1 + 'x' + var2], index=twind)
    dfresult.index.name = 'Time Windows (h)'
    
    if graph == True:
        
        dfresult.plot(color='k', grid=True) 
        plt.title(Baby + ' Mutual Info Score Over Time Between ' + var1 + ' and ' +var2)
        plt.xlabel('Time Windows (h)')
        plt.ylabel(var1 + 'x' + var2 +' Mutual Info Score')

        saveplot(Baby + var1 + 'x' + var2 + 'mutualinfo')
        
        plt.show()
        
    if table == True:
        
        savecsv(Baby + var1 + 'x' + var2 + 'mutualinfo', dfresult)
        
        display(dfresult)
        
    return dfresult    

def MutualInfoLst(variable, graph=True, table=True): #cross mutual info of variable against every variable in varlst
    for i in varlst:
        x = MutualInfo(variable, i, graph, table)
    return x
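
Note that mutual_info_score treats every distinct float as its own discrete label, so on near-continuous signals it is really scoring co-occurrence of unique values. A common alternative (not what this notebook does) is to bin the two signals and score the joint histogram through the contingency argument shown in the signature above. A minimal sketch, with the bin count an arbitrary choice:

def binned_mutual_info(x, y, bins=16):
    #x, y: NaN-free 1-D arrays of equal length
    c_xy = np.histogram2d(x, y, bins)[0] #joint counts
    return mutual_info_score(None, None, contingency=c_xy)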

In [25]:
if Masimo != True:
    MutualInfo('StO2', 'StO2')
else:
    for i in varlst:
        MutualInfoLst(i)


                  Mutual Info Score StO2xStO2
Time Windows (h)
Before                               2.488380
During                               2.292050
12H After                            2.943990
24H After                            2.947833

In [26]:
from entropy import *

#def fuzzyen(x, dim, r, n, scale=True):
#    return entropy(x, dim, r, n=n, scale=scale, remove_baseline=True)
# n is "scale of fuzziness"

def FuzzyEnMeasure(variable, graph = True, table = True):  

    datalst = []
    
    for i in dflst:
        data = i[variable]
        data = data.dropna(how='any') #drop NaNs in series
        try:
            x = fuzzyen(data, 2, .2*np.nanstd(data), n=1) #run FuzzyEn
            datalst.append(x)
        except ValueError:
            datalst.append(np.nan)
        
    dfresult = pd.DataFrame(datalst, columns=[variable + ' FuzzyEn'], index=twind)
    dfresult.index.name = 'Time Windows (h)'
    
    if graph == True:
        
        dfresult[variable + ' FuzzyEn'].plot(color='k', grid=True) 
        plt.title(Baby + ' ' + variable + ' FuzzyEn over time')
        plt.xlabel('Time Windows (h)')
        plt.ylabel(variable + ' FuzzyEn')
        
        saveplot(Baby + variable + '_FuzzyEn')
        
        plt.show()
        
    if table == True:
        
        savecsv(Baby + variable + '_FuzzyEn', dfresult)
        
        display(dfresult)
        
    return dfresult

In [27]:
if Masimo != False and NIRS !=False:
    PIFuzzyEn = FuzzyEnMeasure('PI')
    PRFuzzyEn = FuzzyEnMeasure('PR')
    SpO2FuzzyEn = FuzzyEnMeasure('SpO2')
    StO2FuzzyEn = FuzzyEnMeasure('StO2')
    FTOEFuzzyEn = FuzzyEnMeasure('FTOE')
elif Masimo != True and NIRS == True:
    StO2FuzzyEn = FuzzyEnMeasure('StO2')
elif Masimo == True and NIRS != True:
    PIFuzzyEn = FuzzyEnMeasure('PI')
    PRFuzzyEn = FuzzyEnMeasure('PR')
    SpO2FuzzyEn = FuzzyEnMeasure('SpO2')


                  StO2 FuzzyEn
Time Windows (h)
Before                0.163553
During                0.201589
12H After             0.078530
24H After             0.082570

In [28]:
def CrossFuzzyMeasures(var1, var2, graph = True, table = True):

    datalst = []
    
    for i in dflst:
        z = i.dropna(how='any') #drop before splitting variables to make len equal
        data1 = z[var1]
        data2 = z[var2]
        
        try:
            x = cross_fuzzyen(data1, data2, 2, .2, n=1)
            datalst.append(x)
        except ValueError:
            datalst.append(np.nan)
        
    dfresult = pd.DataFrame(datalst, columns=['Cross FuzzyEn ' + var1 + 'x' + var2], index=twind)
    dfresult.index.name = 'Time Windows (h)'
    
    if graph == True:
        
        dfresult.plot(color='k', grid=True) 
        plt.title(Baby + ' Cross FuzzyEn Over Time Between ' + var1 + ' and ' +var2)
        plt.xlabel('Time Windows (h)')
        plt.ylabel(var1 + 'x' + var2 +' Cross FuzzyEn')
        
        saveplot(Baby + var1 + 'x' + var2 + '_XFuzzyEn')

        plt.show()
        
    if table == True:
        
        savecsv(Baby + var1 + 'x' + var2 + '_XFuzzyEn', dfresult)
        
        display(dfresult)
        
    return dfresult    

def CrossFuzzyLst(variable, graph=True, table=True): #cross FuzzyEn of variable against every variable in varlst
    for i in varlst:
        x = CrossFuzzyMeasures(variable, i, graph, table)
    return x

In [29]:
#run all CrossFuzzyLst
if Masimo != True:
    CrossFuzzyMeasures('StO2', 'StO2')
else:
    for i in varlst:
        CrossFuzzyLst(i)


                  Cross FuzzyEn StO2xStO2
Time Windows (h)
Before                           0.383699
During                           0.392423
12H After                        0.324720
24H After                        0.326201

Chaos Theory

- Lempel-Ziv complexity
- Largest Lyapunov exponent
- Hurst exponent

In [30]:
# Scripts for Chaos Theory based variables
# Imported/adapted from other researchers' code

# #1
from analysis import *
    #Source: https://github.com/thelahunginjeet/pydynet/blob/master/analysis.py
    #def lz_complexity(s):
    #Aboy 2006: binarize the data around a threshold (threshold = median)
    #i.e. turn the data into a string of 0's and 1's

# #2
def LLEcal(data):
    #Source: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html
    #crude estimate: running mean of ln|x|, returning its maximum
    result = []
    lambdas = []
    
    # loop through data
    for x in data:
        result.append(np.log(abs(x))) #np.log is the natural log
        lambdas.append(mean(result)) #running mean of ln|x| so far
    
    return max(lambdas)


# #3
def hurst_mod(data):
    
    #Source:  http://pyeeg.sourceforge.net/index.html?highlight=hurst#pyeeg.hurst
    #modified to drop NaNs
    
    X = data
    N = len(X)

    T = array([float(i) for i in xrange(1,N+1)])
    
    Q = pd.Series(X)
    Y = Q.cumsum()
    
    Ave_T = Y.values/T

    S_T = zeros((N))
    R_T = zeros((N))
    
    for i in xrange(N):
        S_T[i] = std(X[:i+1])
        X_T = Y - T * Ave_T[i]
        R_T[i] = max(X_T[:i + 1]) - min(X_T[:i + 1])

    R_S = R_T / S_T
    R_S = log(R_S)
    n = log(T).reshape(N, 1)
    
    #remove NaNs from both R_S and n
    rrr = np.ndarray.flatten(n)

    dfxxx = pd.DataFrame({
    'n': rrr,
    'R_S' : R_S,
    })
    
    dfxxx = dfxxx.dropna(how='any')
    
    N = len(dfxxx)
    
    n = dfxxx['n'].values.reshape(N, 1)
    R_S = dfxxx['R_S'].values

    H = lstsq(n[1:], R_S[1:])[0]
    return H[0]
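
As a rough sanity check of the modified estimator (approximate expectations; the loop is O(N^2), so keep N modest): rescaled-range analysis should give H near 0.5 for white noise and near 1.0 for a random walk.

np.random.seed(0)
print hurst_mod(np.random.randn(2000))            #white noise, expect H ~ 0.5
print hurst_mod(np.cumsum(np.random.randn(2000))) #random walk, expect H ~ 1.0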

In [31]:
def LZ(var, graph=True, table=True):
    
    LZlst = []
    
    for i in dflst:

        try:
            data = i[var]
            datalz = []
            thresh = np.nanmedian(data)
            
            if data.isnull().sum() == len(data): #if all NaN
                LZlst.append(np.nan)
            
            else:
                #binarize around the median, then join into a bit string
                for v in data:
                    if v < thresh:
                        datalz.append(0)
                    else:
                        datalz.append(1)
                strlz = ''.join(str(x) for x in datalz)

                a = ((1.0*lz_complexity(strlz))/random_lz_complexity(len(strlz),p=0.5)) #normalize against a random sequence
                LZlst.append(a)
            
        except IndexError:
            LZlst.append(np.nan)
    
    dfresult = pd.DataFrame({var + ' LZ': LZlst}, index=twind)
    
    if graph == True:
        
        dfresult.plot(color='k', grid=True) 
        plt.title(Baby + ' Lempel-Ziv Complexity of '+var+' over time')
        plt.xlabel('Time Windows (h)')
        plt.ylabel('Lempel-Ziv Complexity')
        
        saveplot(Baby + var + '_LZComp')

        plt.show()
    
    if table == True:
        
        savecsv(Baby + var + '_LZComp', dfresult)
        
        display(dfresult)
    
    return dfresult

In [32]:
def LLE(var, graph=True, table=True):
    
    LLElst = []
    
    for i in dflst:
        data = i[var].values
        
        try:
            b = LLEcal(data)
            LLElst.append(b)
        except ValueError:
            LLElst.append(np.nan)

    dfresult = pd.DataFrame({var + ' LLE': LLElst}, index=twind)
    
    if graph == True:
        
        dfresult.plot(color='k', grid=True) 
        plt.title(Baby + ' Largest Lyapunov Exponent of '+var+' over time')
        plt.xlabel('Time Windows (h)')
        plt.ylabel('Largest Lyapunov Exponent')

        saveplot(Baby + var + '_LLE')
        
        plt.show()
    
    if table == True:
        
        savecsv(Baby + var + '_LLE', dfresult)
        
        display(dfresult)
    
    return dfresult

In [33]:
def Hurst(var, graph=True, table=True):
    
    Hlst = []
    
    for i in dflst:
        
        data = i[var]
        data = data.dropna(how='any')
        data = data.values
        
        try:
            d = hurst_mod(data)
            Hlst.append(d)
        except ValueError:
            Hlst.append(np.nan)
    
    dfresult = pd.DataFrame({var + ' Hurst Exp': Hlst}, index=twind)
    
    if graph == True:
        
        dfresult.plot(color='k', grid=True) 
        plt.title(Baby + ' Hurst Exponent of '+var+' over time')
        plt.xlabel('Time Windows (h)')
        plt.ylabel('Hurst Exponent')

        saveplot(Baby + var + '_Hurst')
        
        plt.show()
    
    if table == True:
        
        savecsv(Baby + var + '_Hurst', dfresult)
        
        display(dfresult)
    
    return dfresult

In [34]:
for i in varlst:
    LZ(i)
    LLE(i)
    Hurst(i)
        
Audio(url=sound_file, autoplay=True)


             StO2 LZ
Before      0.188246
During      0.285903
12H After   0.156764
24H After   0.159733

            StO2 LLE
Before      4.493425
During      4.499810
12H After   4.510860
24H After   4.510860

            StO2 Hurst Exp
Before            0.788796
During            0.763208
12H After         0.866750
24H After         0.856010
Out[34]:
[audio player widget: beep-05.wav]
In [ ]: